In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from Orange.data import Table
In [5]:
from Orange.projection import PCA
data = Table('iris')
pca = PCA(n_components=2)
pca_model = pca(data)
transformed = pca_model(data)
# Percentage of variance explained for each component
print('Explained variance ratio (first two components): %s' % str(pca_model.explained_variance_ratio_))
target_names = data.domain.class_var.values
for c, i, target_name in zip('rgb', [0, 1, 2], target_names):
    plt.scatter(transformed.X[data.Y == i, 0], transformed.X[data.Y == i, 1],
                c=c, label=target_name)
plt.legend(scatterpoints=1, loc=4)
plt.title('PCA of Iris dataset');
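The directions PCA finds are exposed through pca_model.components_ (one row per component, one column per original attribute). A minimal sketch, assuming the pca_model fitted above is still in scope, to see which iris attributes dominate each component:

# Print the loading of each original attribute on the two components
for comp, ratio in zip(pca_model.components_, pca_model.explained_variance_ratio_):
    loadings = ', '.join('%s: %.2f' % (attr.name, w)
                         for attr, w in zip(data.domain.attributes, comp))
    print('variance ratio %.2f -> %s' % (ratio, loadings))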
In [3]:
data = Table('ionosphere')
n_attributes = len(data.domain.attributes)
print('Original attributes: %d' % n_attributes)
print('Samples: %d' % len(data))
pca = PCA(n_components=6)
pca_model = pca(data)
# Plot the PCA spectrum
# Percentage of variance explained by each of the selected components
plt.plot(pca_model.explained_variance_ratio_, linewidth=2)
plt.xlabel('Components');
plt.ylabel('Explained variance ratio');
plt.xticks(np.arange(pca_model.n_components))
# If all components were kept, the explained variance ratios would sum to 1.0;
# with only the first 6 components the partial sum is smaller
# assert np.sum(pca_model.explained_variance_ratio_) > 0.99
# Reconstruction error
transformed = pca_model(data)
print('Reduced dimension: %d' % (len(transformed.domain.attributes)))
# Apply inverse PCA transform
# Transform data back to its original space, i.e., return an input
# "data" whose transform would be "transformed"
reconstructed = np.dot(transformed.X, pca_model.components_) + pca_model.mean_
rec_err = np.linalg.norm(data.X - reconstructed)
# Q: What happens when the number of components is equal to the number of attributes?
#    (see the check after this cell)
print('Frobenius distance: %5.2f' % rec_err)
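A hedged check of the question above: refitting with n_components equal to the number of attributes should make the explained variance ratios sum to (about) 1.0 and drive the reconstruction error to roughly zero, since no information is discarded.

# Keep all components and repeat the reconstruction
pca_full_model = PCA(n_components=n_attributes)(data)
print('Sum of explained variance ratios: %.4f'
      % np.sum(pca_full_model.explained_variance_ratio_))
full_transformed = pca_full_model(data)
full_reconstructed = np.dot(full_transformed.X, pca_full_model.components_) + pca_full_model.mean_
print('Frobenius distance: %5.2f' % np.linalg.norm(data.X - full_reconstructed))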
In [4]:
# Inspired by http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_3d.html
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
# Generate the data
e = np.exp(1)
np.random.seed(4)
y = np.random.normal(scale=0.5, size=(30000))
x = np.random.normal(scale=0.5, size=(30000))
z = np.random.normal(scale=0.1, size=len(x))
a = x + y
b = 2 * y
c = a - b + z
norm = np.sqrt(a.var() + b.var())
a /= norm
b /= norm
data = Table(np.c_[a, b, c])
print('Original attributes: %d' % len(data.domain.attributes))
pca = PCA(n_components=2)
pca_model = pca(data)
pca_score = pca_model.explained_variance_ratio_
print('Components: %d' % pca_model.n_components)
print('Explained variance ratio: %5.2f' % np.sum(pca_score))
# Plot 3D figures
def plot_figs(fig_num, elev, azim):
    fig = plt.figure(fig_num, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elev, azim=azim)
    ax.scatter(a[::10], b[::10], c[::10], marker='o', alpha=.4)
    # Scale the principal axes for display; an alternative, variance-based
    # scaling is left commented out:
    # x_pca_axis, y_pca_axis, z_pca_axis = pca_model.components_.T * pca_score / pca_score.min()
    x_pca_axis, y_pca_axis, z_pca_axis = 3 * pca_model.components_.T
    x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    # Draw the plane spanned by the two principal components
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
elev = -40
azim = -80
plot_figs(1, elev, azim)
elev = 30
azim = 20
plot_figs(2, elev, azim)
plt.show()
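Since c is a linear combination of a and b plus a little noise, the cloud is essentially planar; as a quick sanity check (same API as above), a third component, if requested, should explain almost no variance:

# With three components the third ratio should be close to zero
pca3_model = PCA(n_components=3)(data)
print('Explained variance ratios: %s' % np.round(pca3_model.explained_variance_ratio_, 3))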
In [5]:
# See Clustering notebook
In [6]:
from sklearn import datasets
n_class = 6
digits = datasets.load_digits(n_class=n_class)
print('Description: %s' % digits['DESCR'])
data = Table(digits.data, Y=digits.target)
print('Original attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
unique_labels = set(data.Y)
print('Classes: %s' % ', '.join(map(str, unique_labels)))
# Plot an exemplar image of a handwritten digit
img = data.X[0].reshape((8, 8))
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([]);
plt.yticks([]);
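To see more than a single digit, a small sketch (using only the arrays already loaded) that shows the first example of each of the six classes:

fig, axes = plt.subplots(1, n_class, figsize=(n_class, 1.5))
for ax, label in zip(axes, sorted(unique_labels)):
    idx = np.where(data.Y == label)[0][0]  # first sample of this class
    ax.imshow(data.X[idx].reshape((8, 8)), cmap=plt.cm.binary)
    ax.set_title(str(int(label)), fontsize=9)
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()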
In [7]:
X_rnd = np.random.rand(len(data), 2)
colors = plt.cm.rainbow(np.linspace(0, 1, n_class))
x_min, x_max = np.min(X_rnd, 0), np.max(X_rnd, 0)
X = (X_rnd - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(int(data.Y[i])),
             color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})
plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
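The normalize-and-annotate loop above is repeated verbatim for every embedding below; a small helper (hypothetical name plot_embedding, relying on the colors array defined above) could factor it out. The cells below keep the explicit loop, but the sketch shows the idea:

def plot_embedding(X_emb, y, title=None):
    """Rescale a 2D embedding to [0, 1] and draw each class label as colored text."""
    x_min, x_max = np.min(X_emb, 0), np.max(X_emb, 0)
    X_scaled = (X_emb - x_min) / (x_max - x_min)
    for i in range(X_scaled.shape[0]):
        plt.text(X_scaled[i, 0], X_scaled[i, 1], str(int(y[i])),
                 color=colors[int(y[i])], fontdict={'weight': 'bold', 'size': 9})
    plt.xticks(())
    plt.yticks(())
    if title is not None:
        plt.title(title)
    plt.tight_layout()
    plt.show()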
In [8]:
from Orange.projection.manifold import MDS
from Orange.distance import Euclidean
# MDS finds a low-dimensional representation of the data in which the pairwise
# distances approximate the distances in the original high-dimensional space
mds = MDS(dissimilarity=Euclidean)
mds_model = mds(data)
x_min, x_max = np.min(mds_model.embedding_, 0), np.max(mds_model.embedding_, 0)
X = (mds_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(int(data.Y[i])),
             color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})
plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
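One hedged way to quantify how well the embedding "respects" the original distances is to correlate the pairwise distances before and after the projection (using scipy.spatial.distance.pdist):

from scipy.spatial.distance import pdist
orig_dist = pdist(data.X)               # distances in the 64-dimensional space
emb_dist = pdist(mds_model.embedding_)  # distances in the 2D embedding
print('Correlation of pairwise distances: %.3f' % np.corrcoef(orig_dist, emb_dist)[0, 1])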
In [9]:
from Orange.projection import Isomap
n_neighbors = 30
isomap = Isomap(n_neighbors=n_neighbors, n_components=2)
isomap_model = isomap(data)
x_min, x_max = np.min(isomap_model.embedding_, 0), np.max(isomap_model.embedding_, 0)
X = (isomap_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(int(data.Y[i])),
             color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})
plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
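The n_neighbors parameter controls the neighborhood graph on which Isomap computes geodesic distances. A rough sketch (just refitting with a few values, no claim about the best setting) to see how sensitive the embedding is:

# Refit Isomap with a few neighborhood sizes and compare the embeddings
plt.figure(figsize=(9, 3))
for j, k in enumerate([5, 30, 100]):
    emb = Isomap(n_neighbors=k, n_components=2)(data).embedding_
    plt.subplot(1, 3, j + 1)
    plt.scatter(emb[:, 0], emb[:, 1], c=data.Y, cmap=plt.cm.rainbow, s=5)
    plt.title('n_neighbors = %d' % k)
    plt.xticks(())
    plt.yticks(())
plt.show()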
In [10]:
from Orange.projection import LocallyLinearEmbedding
n_neighbors = 30
lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2, method='standard')
lle_model = lle(data)
x_min, x_max = np.min(lle_model.embedding_, 0), np.max(lle_model.embedding_, 0)
X = (lle_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(int(data.Y[i])),
             color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})
plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
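method='standard' is only one LLE formulation; scikit-learn, which Orange wraps here, also provides variants such as 'modified', 'hessian' and 'ltsa'. A hedged sketch comparing 'standard' with 'modified' under the same neighborhood size (assuming the wrapper passes method through unchanged):

plt.figure(figsize=(6, 3))
for j, m in enumerate(['standard', 'modified']):
    emb = LocallyLinearEmbedding(n_neighbors=30, n_components=2, method=m)(data).embedding_
    plt.subplot(1, 2, j + 1)
    plt.scatter(emb[:, 0], emb[:, 1], c=data.Y, cmap=plt.cm.rainbow, s=5)
    plt.title(m)
    plt.xticks(())
    plt.yticks(())
plt.show()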